from rank_bm25 import BM25Okapi

# Corpus: each string is one document to be indexed.
# Alternative sample corpus (disabled):
# corpus = [
#     "The cat sat on the mat.",
#     "The cat sat on 111 the mat.",
#     "The dog barked at the moon.",
#     "The sun is shining bright."
# ]

corpus = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?",
]


def _tokenize(text):
    """Lowercase *text*, drop punctuation, and split it on whitespace.

    Every character that is neither alphanumeric nor whitespace is removed
    (not replaced by a space), so "man!" becomes "man" and "3.14" becomes
    "314". Stripping only "." would leave "!" and "?" glued to the
    neighbouring word, producing tokens that a plain-word query never matches.

    Args:
        text: The raw document string.

    Returns:
        A list of lowercase tokens; empty if *text* has no word characters.
    """
    kept = "".join(ch for ch in text.lower() if ch.isalnum() or ch.isspace())
    return kept.split()


# Tokenize every document and strip punctuation.
tokenized_corpus = [_tokenize(doc) for doc in corpus]
print(tokenized_corpus)

# Create the BM25Okapi model over the tokenized corpus.
bm25 = BM25Okapi(tokenized_corpus)

# Input query.
# query = "cat on mat"
query = "weather on London"
# Split on whitespace and lowercase each term; punctuation in the query
# is left untouched.
tokenized_query = [term.lower() for term in query.split()]
print(tokenized_query)

# Compute the scores for the query.
doc_scores = bm25.get_scores(tokenized_query)
print(doc_scores)